In [50]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6
In [76]:
csv_file_path = 'ObesityDataSet_raw_and_data_sinthetic.csv'
csv_df = pd.read_csv(csv_file_path, delimiter=';')
df_csv = pd.DataFrame(csv_df)
df_csv.head()
Out[76]:
| Nombre | Apellido | Genero | Edad | Estatura | Peso | HistorialFamiliar | ConsumeCalorias | ComidasDiarias | AlimentosEntreComidas | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wendy | Greene | Femenino | 21 | 1.62 | 64 | yes | no | 3 | Sometimes |
| 1 | Martha | Jones | Femenino | 21 | 1.52 | 56 | yes | no | 3 | Sometimes |
| 2 | David | Curtis | Masculino | 23 | 1.8 | 77 | yes | no | 3 | Sometimes |
| 3 | Crystal | Solis | Femenino | 27 | 1.8 | 87 | no | no | 3 | Sometimes |
| 4 | Stephanie | Vasquez | Femenino | 22 | 1.78 | 89.8 | no | no | 1 | Sometimes |
In [77]:
#Ingreso a la base de datos de Mysql
usuario = 'root'
contraseña = 'jona'
host = 'localhost'
puerto = '3306'
base_de_datos = 'obesidad' # Nombre de la base de datos
In [78]:
# Crear la cadena de conexión
conexion = f'mysql+mysqlconnector://{usuario}:{contraseña}@{host}:{puerto}/{base_de_datos}'
In [79]:
# Crear el motor de conexión
engine = create_engine(conexion)
In [80]:
# Leer datos de la tabla MySQL
tabla_mysql = 'obesidadsql' # Nombre de la tabla
mysql_df = pd.read_sql(tabla_mysql, con=engine)
df_sql = pd.DataFrame(mysql_df)
df_sql.head()
Out[80]:
| Nombre | Apellido | Fuma | Agua | ControlCalorias | FrecuenciaEjercicio | TiempoRecreacion | ConsumoAlcohol | MedioTransporte | Obesidad | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wendy | Greene | no | 2 | no | 0 | 1 | no | Public_Transportation | Normal_Weight |
| 1 | Martha | Jones | yes | 3 | yes | 3 | 0 | Sometimes | Public_Transportation | Normal_Weight |
| 2 | David | Curtis | no | 2 | no | 2 | 1 | Frequently | Public_Transportation | Normal_Weight |
| 3 | Crystal | Solis | no | 2 | no | 2 | 0 | Frequently | Walking | Overweight_Level_I |
| 4 | Stephanie | Vasquez | no | 2 | no | 0 | 0 | Sometimes | Public_Transportation | Overweight_Level_II |
In [81]:
#Unir los DataFrames por las columnas 'nombre' y 'apellido'
df_unido = pd.merge(csv_df, mysql_df, on=['Nombre', 'Apellido'], how='inner')
df_unido.head()
Out[81]:
| Nombre | Apellido | Genero | Edad | Estatura | Peso | HistorialFamiliar | ConsumeCalorias | ComidasDiarias | AlimentosEntreComidas | Fuma | Agua | ControlCalorias | FrecuenciaEjercicio | TiempoRecreacion | ConsumoAlcohol | MedioTransporte | Obesidad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wendy | Greene | Femenino | 21 | 1.62 | 64 | yes | no | 3 | Sometimes | no | 2 | no | 0 | 1 | no | Public_Transportation | Normal_Weight |
| 1 | Martha | Jones | Femenino | 21 | 1.52 | 56 | yes | no | 3 | Sometimes | yes | 3 | yes | 3 | 0 | Sometimes | Public_Transportation | Normal_Weight |
| 2 | David | Curtis | Masculino | 23 | 1.8 | 77 | yes | no | 3 | Sometimes | no | 2 | no | 2 | 1 | Frequently | Public_Transportation | Normal_Weight |
| 3 | Crystal | Solis | Femenino | 27 | 1.8 | 87 | no | no | 3 | Sometimes | no | 2 | no | 2 | 0 | Frequently | Walking | Overweight_Level_I |
| 4 | Stephanie | Vasquez | Femenino | 22 | 1.78 | 89.8 | no | no | 1 | Sometimes | no | 2 | no | 0 | 0 | Sometimes | Public_Transportation | Overweight_Level_II |
In [82]:
# Verificar y convertir los datos a tipo numérico
df_unido['Edad'] = pd.to_numeric(df_unido['Edad'], errors='coerce')
df_unido['Peso'] = pd.to_numeric(df_unido['Peso'], errors='coerce')
df_unido['Estatura'] = pd.to_numeric(df_unido['Estatura'], errors='coerce')
In [83]:
# Definir funciones para las nuevas columnas
def calculate_bmi(weight, height):
return round(weight / (height ** 2), 2)
def categorize_bmi(bmi):
if bmi < 18.5:
return 'Underweight'
elif 18.5 <= bmi < 24.9:
return 'Normal weight'
elif 25 <= bmi < 29.9:
return 'Overweight'
elif 30 <= bmi < 34.9:
return 'Obesity Class I'
elif 35 <= bmi < 39.9:
return 'Obesity Class II'
else:
return 'Obesity Class III'
def evaluate_physical_activity(exercise_frequency):
if exercise_frequency == 0:
return 'Sedentary'
elif exercise_frequency <= 2:
return 'Moderately active'
else:
return 'Active'
def assess_health_risk(smokes, alcohol_consumption, family_history):
risk = 0
if smokes == 'yes':
risk += 1
if alcohol_consumption in ['Sometimes', 'Frequently']:
risk += 1
if family_history == 'yes':
risk += 1
if risk == 0:
return 'Low'
elif risk == 1:
return 'Moderate'
else:
return 'High'
def hydration_level(water_intake):
if water_intake >= 8:
return 'Well hydrated'
elif 4 <= water_intake < 8:
return 'Adequately hydrated'
else:
return 'Poorly hydrated'
# Añadir las nuevas colunmas al dataframe
df_unido['IMC'] = df_unido.apply(lambda row: calculate_bmi(row['Peso'], row['Estatura']), axis=1)
df_unido['CategoriaIMC'] = df_unido['IMC'].apply(categorize_bmi)
df_unido['ActividadFísica'] = df_unido['FrecuenciaEjercicio'].apply(evaluate_physical_activity)
df_unido['RiesgoSalud'] = df_unido.apply(lambda row: assess_health_risk(row['Fuma'], row['ConsumoAlcohol'], row['HistorialFamiliar']), axis=1)
df_unido['Hidratación'] = df_unido['Agua'].apply(hydration_level)
# Mostrar el dataframe
df_unido.head()
Out[83]:
| Nombre | Apellido | Genero | Edad | Estatura | Peso | HistorialFamiliar | ConsumeCalorias | ComidasDiarias | AlimentosEntreComidas | ... | FrecuenciaEjercicio | TiempoRecreacion | ConsumoAlcohol | MedioTransporte | Obesidad | IMC | CategoriaIMC | ActividadFísica | RiesgoSalud | Hidratación | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Wendy | Greene | Femenino | 21.0 | 1.62 | 64.0 | yes | no | 3 | Sometimes | ... | 0 | 1 | no | Public_Transportation | Normal_Weight | 24.39 | Normal weight | Sedentary | Moderate | Poorly hydrated |
| 1 | Martha | Jones | Femenino | 21.0 | 1.52 | 56.0 | yes | no | 3 | Sometimes | ... | 3 | 0 | Sometimes | Public_Transportation | Normal_Weight | 24.24 | Normal weight | Active | High | Poorly hydrated |
| 2 | David | Curtis | Masculino | 23.0 | 1.80 | 77.0 | yes | no | 3 | Sometimes | ... | 2 | 1 | Frequently | Public_Transportation | Normal_Weight | 23.77 | Normal weight | Moderately active | High | Poorly hydrated |
| 3 | Crystal | Solis | Femenino | 27.0 | 1.80 | 87.0 | no | no | 3 | Sometimes | ... | 2 | 0 | Frequently | Walking | Overweight_Level_I | 26.85 | Overweight | Moderately active | Moderate | Poorly hydrated |
| 4 | Stephanie | Vasquez | Femenino | 22.0 | 1.78 | 89.8 | no | no | 1 | Sometimes | ... | 0 | 0 | Sometimes | Public_Transportation | Overweight_Level_II | 28.34 | Overweight | Sedentary | Moderate | Poorly hydrated |
5 rows × 23 columns
In [84]:
# Guardar el DataFrame unido en un archivo CSV
output_csv_path = 'salida.csv'
df_unido.to_csv(output_csv_path, index=False)
In [85]:
#Visualizaciones con Matplotlib
import matplotlib.pyplot as plt
tabla_contingencia = pd.crosstab(df_unido['Genero'], df_unido['Obesidad'])
tabla_contingencia.plot(kind='bar', figsize=(10, 6))
plt.title('Distribución de Obesidad por Genero')
plt.xlabel('Genero')
plt.ylabel('Peso')
plt.xticks(rotation=0)
plt.legend(title='Obesidad')
plt.grid(True)
plt.show()
In [86]:
# Definir los grupos de estatura
bins = [1.50, 1.60, 1.70, 1.80, 1.90, 2.00]
labels = ['1.50-1.59', '1.60-1.69', '1.70-1.79', '1.80-1.89', '1.90-1.99']
df_unido['GrupoEstatura'] = pd.cut(df_unido['Estatura'], bins=bins, labels=labels, right=False)
# Calcular el peso promedio por grupo de estatura
peso_promedio_por_grupo = df_unido.groupby('GrupoEstatura')['Peso'].mean().reset_index()
# Configurar el tamaño del gráfico
plt.figure(figsize=(10, 6))
# Crear el gráfico de barras
plt.bar(peso_promedio_por_grupo['GrupoEstatura'], peso_promedio_por_grupo['Peso'], color='skyblue')
# Configuración del gráfico
plt.title('Peso Promedio por Grupo de Estatura')
plt.xlabel('Grupo de Estatura (m)')
plt.ylabel('Peso Promedio (kg)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Mostrar el gráfico
plt.show()
C:\Users\Josshy\AppData\Local\Temp\ipykernel_3112\169800126.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
peso_promedio_por_grupo = df_unido.groupby('GrupoEstatura')['Peso'].mean().reset_index()
In [87]:
# Eliminar filas con valores NaN en las columnas numéricas
df_unido = df_unido.dropna(subset=['Edad', 'Estatura', 'Peso'])
In [89]:
# Visualizaciones con Bokeh
# Configurar salida de Bokeh
output_notebook()
# Gráfica 1: Distribución de Categoría de IMC
imc_counts = df_unido['CategoriaIMC'].value_counts()
categories = list(imc_counts.index)
counts = imc_counts.values
p1 = figure(x_range=categories, height=350, title="Distribución de Categorías de IMC",
toolbar_location=None, tools="")
p1.vbar(x=categories, top=counts, width=0.9, line_color='white',
fill_color=factor_cmap('x', palette=Spectral6, factors=categories))
p1.xgrid.grid_line_color = None
p1.y_range.start = 0
p1.yaxis.axis_label = 'Cantidad'
p1.xaxis.axis_label = 'Categoría de IMC'
p1.legend.orientation = "horizontal"
p1.legend.location = "top_center"
# Gráfica 2: Relación entre Edad e IMC
p2 = figure(height=350, title="Relación entre Edad e IMC",
toolbar_location=None, tools="hover", tooltips="Edad: @Edad, IMC: @IMC")
p2.circle('Edad', 'IMC', size=10, source=df, fill_alpha=0.6)
p2.xaxis.axis_label = 'Edad'
p2.yaxis.axis_label = 'IMC'
# Mostrar las gráficas
grid = gridplot([[p1, p2]])
show(grid)
C:\Users\Josshy\AppData\Local\Temp\ipykernel_3112\627053147.py:21: UserWarning: You are attempting to set `plot.legend.orientation` on a plot that has zero legends added, this will have no effect. Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set. p1.legend.orientation = "horizontal" C:\Users\Josshy\AppData\Local\Temp\ipykernel_3112\627053147.py:22: UserWarning: You are attempting to set `plot.legend.location` on a plot that has zero legends added, this will have no effect. Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set. p1.legend.location = "top_center" BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
In [68]:
# Gráficas pywalker
import pygwalker as pyg
# Crear un objeto Pygwalker
pyg.walk(df_unido)
Box(children=(HTML(value='<div id="ifr-pyg-0006198811253430EByRTNe1qrWcjGVI" style="height: auto">\n <head>…
Loading Graphic-Walker UI...
Out[68]:
<pygwalker.api.pygwalker.PygWalker at 0x24e0b6f8940>
In [ ]: